knitr::opts_chunk$set(echo = TRUE)
# install.packages("Hmisc")
# install.packages("pastecs")
# install.packages("ggplot2")
# install.packages("Hmisc")
# install.packages("fastDummies")
# install.packages("lmtest")
# install.packages("lmtest")
# install.packages("caretEnsemble")
# install.packages("Amelia")
# install.packages("GGally")
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(leaps)
library(reshape2)
library(gridExtra)
library(fastDummies)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(pastecs)
library(skimr)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.6 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks pastecs::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks pastecs::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(caret)
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
##
## The following object is masked from 'package:ggplot2':
##
## autoplot
library(psych)
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rpart)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:psych':
##
## outlier
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(nnet)
library(ROCR)
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(caret)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
##
## The following object is masked from 'package:Metrics':
##
## accuracy
##
## The following object is masked from 'package:caretEnsemble':
##
## autoplot
library(rpart)
library(rattle)
## Loading required package: bitops
##
## Attaching package: 'bitops'
##
## The following object is masked from 'package:Matrix':
##
## %&%
##
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
##
## The following object is masked from 'package:randomForest':
##
## importance
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:purrr':
##
## compact
library(rlist)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:Metrics':
##
## auc
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROSE)
## Loaded ROSE 0.0-4
library(plotly)
##
## Attaching package: 'plotly'
##
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(rattle)
library(rpart.plot)
library(RColorBrewer)
# Load the Caravan insurance data set (CoIL 2000 challenge).
# The file ships both splits in one table: ORIGIN (chr) marks train/test,
# the other 86 columns are numeric category codes; CARAVAN is the target.
# reading file
df = readr::read_csv("caravan-insurance-challenge.csv")
## Rows: 9822 Columns: 87
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ORIGIN
## dbl (86): MOSTYPE, MAANTHUI, MGEMOMV, MGEMLEEF, MOSHOOFD, MGODRK, MGODPR, MG...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dimension = dim(df)
# total number of observations are 9822
# total number of variables are 87
# totalRows is reused later to compute buyer / non-buyer ratios.
totalRows = dimension[1]
# data type of 86 variables is number and 1 variable is char, our target variable is number of policies bought remaining variables are categorical numeric.
head(df)
## # A tibble: 6 × 87
## ORIGIN MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 train 33 1 3 2 8 0 5 1 3
## 2 train 37 1 2 2 8 1 4 1 4
## 3 train 37 1 2 2 8 0 4 2 4
## 4 train 9 1 3 3 3 2 3 2 4
## 5 train 40 1 4 2 10 1 4 1 4
## 6 train 23 1 2 1 5 0 5 0 5
## # … with 77 more variables: MRELGE <dbl>, MRELSA <dbl>, MRELOV <dbl>,
## # MFALLEEN <dbl>, MFGEKIND <dbl>, MFWEKIND <dbl>, MOPLHOOG <dbl>,
## # MOPLMIDD <dbl>, MOPLLAAG <dbl>, MBERHOOG <dbl>, MBERZELF <dbl>,
## # MBERBOER <dbl>, MBERMIDD <dbl>, MBERARBG <dbl>, MBERARBO <dbl>, MSKA <dbl>,
## # MSKB1 <dbl>, MSKB2 <dbl>, MSKC <dbl>, MSKD <dbl>, MHHUUR <dbl>,
## # MHKOOP <dbl>, MAUT1 <dbl>, MAUT2 <dbl>, MAUT0 <dbl>, MZFONDS <dbl>,
## # MZPART <dbl>, MINKM30 <dbl>, MINK3045 <dbl>, MINK4575 <dbl>, …
# explore top 5 rows
# summary of main data set
# stat.desc(df)
# summary(df)
# missing values in main data set
paste0("Total missing values:", sum(is.na(df)))
## [1] "Total missing values:0"
desc = skim(df)
unInsuredRows = df[df$CARAVAN == 0,]
insuredRows = df[df$CARAVAN == 1,]
peopleNotInsuredWithCaravan = nrow(unInsuredRows);
peopleInsuredWithCaravan = nrow(insuredRows);
ratioOfPeopleWhoDontBought = peopleNotInsuredWithCaravan/totalRows * 100
ratioOfPeopleWhoBought = peopleInsuredWithCaravan/totalRows * 100
# Only about 5.96% of customers bought the insurance versus ~94% who did not. The classes are heavily imbalanced, and the low uptake suggests the company needs to improve how it markets this product.
dat <- data.frame(
policy_status = factor(c("Not Insured","Insured"), levels=c("Not Insured","Insured")),
Count = c( peopleNotInsuredWithCaravan , peopleInsuredWithCaravan)
)
ggplot(data=dat, aes(x=policy_status, y=Count, fill=policy_status)) +
geom_bar(colour="red", stat="identity")
ggplot(df,aes(factor(df$MOSHOOFD))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
labs(x="Customer Main type") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Caravan Policy based on Customer Main Type") +
theme(plot.title = element_text(hjust = 0.5))
df$maintype = df$MOSHOOFD
nrow(df[df$maintype == 1 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 2 & df$CARAVAN == 1,])
## [1] 103
nrow(df[df$maintype == 3 & df$CARAVAN == 1,])
## [1] 109
nrow(df[df$maintype == 4 & df$CARAVAN == 1,])
## [1] 0
nrow(df[df$maintype == 5 & df$CARAVAN == 1,])
## [1] 18
nrow(df[df$maintype == 6 & df$CARAVAN == 1,])
## [1] 9
nrow(df[df$maintype == 7 & df$CARAVAN == 1,])
## [1] 35
nrow(df[df$maintype == 8 & df$CARAVAN == 1,])
## [1] 151
nrow(df[df$maintype == 9 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 10 & df$CARAVAN == 1,])
## [1] 11
# Here we wanted to see which customer main type has the highest frequency/count of buying the insurance. Based on the results, at least 4 main customer categories buy insurance. For ease of understanding we only consider the top 2: categories 8 and 3, where 8 = Family with grown-ups and 3 = Driven Growers.
ggplot(df,aes(factor(df$MOSTYPE))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
labs(x="Customer Sub type") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Policy Bought based on Customer sub Type") +
theme(plot.title = element_text(hjust = 0.5))
df$subtype = df$MOSTYPE
nrow(df[df$subtype == 33 & df$CARAVAN == 1,])
## [1] 80
nrow(df[df$subtype == 8 & df$CARAVAN == 1,])
## [1] 72
# category 33 and 8 purchased more policies.
# Based on our main category, which comprises various sub-categories, we can see that sub-categories 33 and 8 are prone to buying insurance. These should be considered the characteristics/attributes of the customer types within the main category. Hence, those who are middle class, and those who are lower class but have large families, have a higher chance of buying the insurance.
ggplot(df,aes(factor(df$MGEMLEEF))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
geom_text(stat='count', aes(label=..count..), vjust=0) +
labs(x="Age Group") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Policy bought on age group") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
df$age = df$MGEMLEEF
nrow(df[df$age == 1 & df$CARAVAN == 1,])
## [1] 1
nrow(df[df$age == 2 & df$CARAVAN == 1,])
## [1] 156
nrow(df[df$age == 3 & df$CARAVAN == 1,])
## [1] 303
nrow(df[df$age == 4 & df$CARAVAN == 1,])
## [1] 105
nrow(df[df$age == 5 & df$CARAVAN == 1,])
## [1] 20
nrow(df[df$age == 6 & df$CARAVAN == 1,])
## [1] 1
# Here we have explored to see what is the age range of the customers that buy the insurance. Based on our analysis we see that customers who are between the ages 40-50 are prone to buying insurance compared with others. Hence, we could say that it is among the many characteristics of the main customer group [3,8]
ggplot(df,aes(factor(df$MAANTHUI))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
geom_text(stat='count', aes(label=..count..), vjust=0) +
labs(x="Number of houses") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Number of houses customer has who bought insurance") +
theme(plot.title = element_text(hjust = 0.5))
df$noofhouses = df$MAANTHUI
nrow(df[df$noofhouses == 1 & df$CARAVAN == 1,])
## [1] 526
nrow(df[df$noofhouses == 2 & df$CARAVAN == 1,])
## [1] 59
nrow(df[df$noofhouses == 3 & df$CARAVAN == 1,])
## [1] 1
# Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.
ggplot(df,aes(factor(df$MGEMOMV))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
geom_text(stat='count', aes(label=..count..), vjust=0) +
labs(x="Number of house hold") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Number of house hold") +
theme(plot.title = element_text(hjust = 0.5))
df$noofhousehold = df$MGEMOMV
nrow(df[df$noofhousehold == 1 & df$CARAVAN == 1,])
## [1] 11
nrow(df[df$noofhousehold == 2 & df$CARAVAN == 1,])
## [1] 195
nrow(df[df$noofhousehold == 3 & df$CARAVAN == 1,])
## [1] 275
df$hasThreeHouseHold = ifelse(df$noofhousehold == 3, 1, 0)
# Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.
corrplot(cor(df[, c("subtype","maintype", "age", "noofhouses", "noofhousehold", "CARAVAN")]), method = "number")
# From the correlation matrix we have below, we have some interesting insights. The reason to run the correlation matrix was to figure out variables of interest which might cause either overfitting or underfitting. # Here we can see that the variables of interest are positively correlated. One exception to this is the correlation between age and number of households. We see that there is a negative correlation between it. Which actually makes sense because, the greater the age, the no of households will decrease.
table(df$MAANTHUI)
##
## 1 2 3 4 5 6 7 8 10
## 8915 821 64 4 3 3 8 2 2
# Build the house-count dummies BEFORE capping MAANTHUI.
# BUG FIX: in the original order MAANTHUI was first capped at 2, so the
# later test `MAANTHUI > 2` could never be TRUE and moreThanTwoHouse was
# always 0 for every row.
df$OneHouse = ifelse(df$MAANTHUI == 1, 1, 0)
df$moreThanTwoHouse = ifelse(df$MAANTHUI > 2, 1, 0)
# by looking frequencies of houses we categorized houses into dummies,
# then collapse the sparse tail (counts >= 3, see table above) into 2.
df$MAANTHUI = replace(df$MAANTHUI, df$MAANTHUI > 2, 2)
df$averageFamily = ifelse(df$MOSTYPE %in% c(12,11,9,10,13), 1, 0)
df$loners = ifelse(df$MOSTYPE %in% c(17,15,18,16,19), 1, 0)
df$conservativeFamilies = ifelse(df$MOSTYPE %in% c(39,38), 1, 0)
df$crusingSeniors = ifelse(df$MOSTYPE %in% c(26,25,28,27), 1, 0)
df$drivenGrowers = ifelse(df$MOSTYPE %in% c(6,7,8), 1, 0)
df$grownups = ifelse(df$MOSTYPE %in% c(33,34,35,36,37), 1, 0)
df$framers = ifelse(df$MOSTYPE %in% c(40,41), 1, 0)
df$livingWell = ifelse(df$MOSTYPE %in% c(20,21,22,23,24), 1, 0)
df$retired = ifelse(df$MOSTYPE %in% c(29,30,31,32), 1, 0)
df$successful = ifelse(df$MOSTYPE %in% c(1,2,3,4,5), 1, 0)
dat <- data.frame(
Categorized_Customers = factor(c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful"), levels=c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful")),
Count = c( sum(df$averageFamily), sum(df$loners), sum(df$conservativeFamilies), sum(df$crusingSeniors), sum(df$drivenGrowers), sum(df$grownups), sum(df$framers), sum(df$livingWell), sum(df$retired), sum(df$successful) )
)
ggplot(data=dat, aes(x=Categorized_Customers, y=Count, fill=Categorized_Customers)) +
geom_bar(colour="red", stat="identity")
# Convert an ordinal income category (codes 1-9) into an approximate
# numeric income, expressed as a fraction of the bracket's ceiling.
# Fractions match the original step-wise ifelse chains exactly
# (1 -> 5%, 2 -> 17%, ..., 9 -> 100%). Codes outside 1-9 (e.g. 0) are
# returned unchanged, just as the original chains left them untouched.
categoryToIncome = function(category, ceilingIncome) {
  fractions = c(0.05, 0.17, 0.3, 0.43, 0.56, 0.69, 0.82, 0.94, 1)
  result = category
  inRange = !is.na(category) & category >= 1 & category <= 9
  result[inRange] = fractions[category[inRange]] * ceilingIncome
  result
}
# Converting the five income-bracket categories into numeric values.
# Bracket ceilings: <30k, 30-45k, 45-75k, 75-122k, and >123k.
df$MINKM30_c = categoryToIncome(df$MINKM30, 30000)
df$MINK3045_c = categoryToIncome(df$MINK3045, 45000)
df$MINK4575_c = categoryToIncome(df$MINK4575, 75000)
df$MINK7512_c = categoryToIncome(df$MINK7512, 122000)
df$MINK123M_c = categoryToIncome(df$MINK123M, 123000)
# Average income across the five converted brackets
df$MINKGEM_c = (df$MINK123M_c + df$MINK7512_c + df$MINK4575_c + df$MINK3045_c + df$MINKM30_c)/5
hist(df$MINKGEM_c, main ="Income in numerical", xlab ="Income")
# Convert the age-band category MGEMLEEF (1-6) into a representative
# mid-point age. Each step rewrites the column in place; already-converted
# values (25-75) never collide with the remaining raw codes (2-6), so the
# chained rewrites are safe.
df$MGEMLEEF_c = ifelse(df$MGEMLEEF == 1, 25, df$MGEMLEEF)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 2, 35, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 3, 45, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 4, 55, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 5, 65, df$MGEMLEEF_c)
# BUG FIX: the last step originally re-tested `== 4` (already mapped to 55
# two lines up, so it matched nothing) and category 6 was never converted
# to an age; it must test `== 6`.
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 6, 75, df$MGEMLEEF_c)
# MOSTYPE : customer subtype
# MFWEKIND: Household with children
# MOPLLAAG: Lower level education
# MHHUUR : Rented house
# MHKOOP : Home owners
# MINKM30: Income < 30.000 low income
# MINK7512: Income 75-122.000 high income
# MKOOPKLA: Purchasing power class
# PPERSAUT: Contribution car policies
# CARAVAN: Number of mobile home policies 0 - 1
plot_ly(x = ~df$MINKM30_c, y = ~df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
# After having converted the categorical income variable to numerical, here we take income less than 30,000 against customer sub-categories and draw a box plot. From the box plot below, we can see that as soon as income crosses 10K we begin to see a few outliers for certain sub-categories. Another important thing to note is that when income jumps above 20K, the distance of the outliers starts to increase. We could say that we are seeing extreme outliers in income levels ranging from 25K to 30K.
# Now the question is if we should keep outliers in our analysis, whether mild or extreme or delete both of them and then proceed?
plot_ly(y = ~df$MOSTYPE, x = ~df$MINK3045_c, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
plot_ly(x = df$MINK4575_c, y = df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
# Class balance of the target: the data is heavily imbalanced.
prop.table(table(df$CARAVAN))
##
## 0 1
## 0.94033802 0.05966198
# 94% people didn't buy insurance and only 5% bought. under sampling problem
barplot(prop.table(table(df$CARAVAN)), col = rainbow(2), ylim = c(0,0.7), main = "Class Distribution")
# training data: rows flagged ORIGIN == "train"; drop the ORIGIN column (col 1).
df_train = (df[df$ORIGIN == "train",])
df_train = (df_train[,-1])
table(df_train$CARAVAN)
##
## 0 1
## 5474 348
#testing data: rows flagged ORIGIN == "test"; drop the ORIGIN column (col 1).
df_test = (df[df$ORIGIN == "test",])
df_test = (df_test[,-1])
nrow(df_test)
## [1] 4000
table(df_test$CARAVAN)
##
## 0 1
## 3762 238
# Oversample the minority class to balance training data.
# N = 10948 is hard-coded as 2 * 5474 (twice the majority-class count).
over_train = ovun.sample(CARAVAN ~ ., data =df_train, method = "over", N =10948)$data
table(over_train$CARAVAN)
##
## 0 1
## 5474 5474
# N = nrow(df_test) keeps the test set at its natural size and class
# balance — i.e. the test set is deliberately NOT rebalanced (see below).
over_test = ovun.sample(CARAVAN ~ ., data =df_test, method = "over", N =nrow(df_test))$data
table(over_test$CARAVAN)
##
## 0 1
## 3762 238
# We deliberately keep the test data imbalanced; rebalancing it caused
# errors at prediction time.
# Cast every column to factor so the category codes are treated as
# categorical levels by the models below.
for(i in 1:ncol(over_train)){
over_train[,i] <- as.factor(over_train[,i])
}
for(i in 1:ncol(over_test)){
over_test[,i] <- as.factor(over_test[,i])
}
# Re-cast the two engineered continuous columns back to numeric.
# NOTE(review): as.numeric() on a factor returns the level CODES (1..k),
# not the original numeric values — confirm this rank-preserving recoding
# is intended for MINKGEM_c / MGEMLEEF_c.
over_test$MINKGEM_c = as.numeric(over_test$MINKGEM_c)
over_train$MINKGEM_c = as.numeric(over_train$MINKGEM_c)
over_test$MGEMLEEF_c = as.numeric(over_test$MGEMLEEF_c)
over_train$MGEMLEEF_c = as.numeric(over_train$MGEMLEEF_c)
# Thin wrapper so every model summary in this script goes through one
# helper (keeps the reporting calls uniform).
drewSummary <- function(model) summary(model)
# Score `model` on `test_data` and report a caret confusion matrix,
# treating CARAVAN = 1 as the positive class. Predicted probabilities are
# thresholded at 0.5 to form hard class labels.
drewMatrix <- function(model, test_data) {
  probs <- predict(model, test_data, type = "response")
  hard_labels <- ifelse(probs >= 0.5, 1, 0)
  confusionMatrix(
    as.factor(hard_labels),
    as.factor(test_data$CARAVAN),
    positive = "1"
  )
}
# Likelihood-ratio (chi-squared) comparison of two nested models.
drewAnova <- function(model1, model2) anova(model1, model2, test = 'Chisq')
# Plot the ROC curve for `model` evaluated on the global `over_test` set.
# Probabilities are thresholded at 0.5 into hard 0/1 calls before being
# handed to pROC, matching the original behavior.
drewROC <- function(model) {
  probs <- predict(model, over_test, type = "response")
  hard_calls <- ifelse(probs >= 0.5, 1, 0)
  roc_curve <- roc(over_test$CARAVAN, hard_calls)
  plot.roc(roc_curve)
}
# Extract the RMSE of `predictedClass` against the global `over_test`
# response. Index [2] picks the second cell of the 1-row accuracy matrix
# — presumably the RMSE column (ME, RMSE, MAE, ...); TODO confirm which
# accuracy() (forecast masks Metrics) is dispatched here.
# NOTE(review): as.numeric() on the factor CARAVAN yields level codes
# (1/2), not the raw 0/1 labels — confirm this offset is intended.
getRMSE = function(predictedClass){
accuracy(predictedClass, as.numeric(over_test$CARAVAN))[2]
}
# Work on a copy of the oversampled training set so over_train keeps its
# factor columns for modeling.
new_data = over_train
# NOTE(review): these columns were cast to factor earlier, so as.numeric()
# returns level codes (1..k) rather than the original category values.
# The mapping is rank-preserving, but correlations computed below are on
# codes, not raw values — confirm this is acceptable.
new_data$MOSHOOFD = as.numeric(new_data$MOSHOOFD)
new_data$MGEMOMV = as.numeric(new_data$MGEMOMV)
new_data$MINKGEM = as.numeric(new_data$MINKGEM)
new_data$MGEMLEEF = as.numeric(new_data$MGEMLEEF)
new_data$CARAVAN = as.numeric(new_data$CARAVAN)
new_data$OneHouse = as.numeric(new_data$OneHouse)
# Pairwise correlations among the candidate predictors and the target.
corrplot(cor(subset(new_data , select = c("MOSHOOFD", "MGEMOMV", "OneHouse", "MINKGEM", "MGEMLEEF", "CARAVAN"))), method = "number", type = "upper")
set.seed(123)
logit.reg = glm(CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c+MGEMLEEF_c, data = over_train, family = binomial (link = "logit"))
logit.reg$xlevels[["MGEMOMV"]] <- union(logit.reg$xlevels[["MGEMOMV"]], levels(over_test$MGEMOMV))
drewSummary(logit.reg)
##
## Call:
## glm(formula = CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c +
## MGEMLEEF_c, family = binomial(link = "logit"), data = over_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7121 -1.1587 0.3691 1.1464 1.9105
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.309e-01 2.067e-01 -3.052 0.00227 **
## MOSHOOFD2 4.785e-01 8.530e-02 5.609 2.04e-08 ***
## MOSHOOFD3 -1.825e-01 8.124e-02 -2.247 0.02465 *
## MOSHOOFD4 -1.460e+01 1.212e+02 -0.120 0.90409
## MOSHOOFD5 -1.046e+00 1.077e-01 -9.709 < 2e-16 ***
## MOSHOOFD6 -1.366e+00 1.620e-01 -8.432 < 2e-16 ***
## MOSHOOFD7 -7.616e-01 9.582e-02 -7.949 1.89e-15 ***
## MOSHOOFD8 -3.133e-01 7.217e-02 -4.341 1.42e-05 ***
## MOSHOOFD9 -2.058e-01 8.669e-02 -2.374 0.01760 *
## MOSHOOFD10 -1.325e+00 1.333e-01 -9.935 < 2e-16 ***
## MGEMOMV2 2.640e-01 1.292e-01 2.044 0.04096 *
## MGEMOMV3 2.890e-01 1.322e-01 2.186 0.02878 *
## MGEMOMV4 3.228e-01 1.433e-01 2.252 0.02431 *
## MGEMOMV5 2.260e-01 2.216e-01 1.020 0.30795
## OneHouse1 6.377e-02 7.015e-02 0.909 0.36328
## MINKGEM_c 1.576e-03 2.624e-04 6.004 1.92e-09 ***
## MGEMLEEF_c 9.684e-02 3.096e-02 3.128 0.00176 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177 on 10947 degrees of freedom
## Residual deviance: 14376 on 10931 degrees of freedom
## AIC: 14410
##
## Number of Fisher Scoring iterations: 13
drewMatrix(logit.reg, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 221 3
## 1 3541 235
##
## Accuracy : 0.114
## 95% CI : (0.1043, 0.1243)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0058
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.98739
## Specificity : 0.05875
## Pos Pred Value : 0.06224
## Neg Pred Value : 0.98661
## Prevalence : 0.05950
## Detection Rate : 0.05875
## Detection Prevalence : 0.94400
## Balanced Accuracy : 0.52307
##
## 'Positive' Class : 1
##
drewROC(logit.reg)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# getRMSE(logit.reg)
# drewAnova(logit.reg)
# we did regression with/out house we see a minimal affect i.e with house model is predicting 60 true positives without it's predicting 65 and it's not significant, so we decided not to include this variable.
# difference in deviance = Null deviance (15177) - 14398 = 872
train_2 = over_train
train_2 = subset(over_train, select = -c(maintype,subtype,age,noofhouses,noofhousehold,hasThreeHouseHold,OneHouse,moreThanTwoHouse,averageFamily,loners,conservativeFamilies,crusingSeniors,drivenGrowers,grownups,framers,livingWell,retired,successful,MINKM30_c,MINK3045_c,MINK4575_c,MINK7512_c,MINK123M_c,MINKGEM_c,MGEMLEEF_c,MOSHOOFD,MGEMOMV,MAANTHUI,MINKGEM,MGEMLEEF) )
length(train_2)
## [1] 81
# Dropping the 5 variables used in the first regression, plus the dummies we created earlier. (This line must be a comment — as bare text it is a syntax error in an R script.)
new_df = train_2
for(i in 1:ncol(new_df)){
new_df[,i] <- as.integer(new_df[,i])
}
zv = apply(new_df, 2, function(x) length(unique(x)) == 1)
dfr = new_df[, !zv]
n=length(colnames(dfr))
correlationMatrix = cor(dfr[,1:n],use="complete.obs")
summary(correlationMatrix[upper.tri(correlationMatrix)])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.999761 -0.024534 -0.001756 0.010891 0.037462 0.983494
# After removing our suspected predictors we still have strong positive correlation with 0.98% and strong negative corelation with -0.99%, we need to find which of them are highly correlated
high = findCorrelation(correlationMatrix, cutoff = 0.75, names = TRUE)
high
## [1] "MHHUUR" "MZPART" "MSKA" "MRELOV" "PBRAND" "PWAPART"
## [7] "PTRACTOR" "APERSAUT" "AWABEDR" "MGODGE" "AAANHANG" "PWALAND"
## [13] "PZEILPL" "PBESAUT" "APLEZIER" "PBYSTAND" "PGEZONG" "AWERKT"
## [19] "AWAOREG" "PLEVEN" "PFIETS" "PINBOED" "PBROM" "PPERSONG"
## [25] "PMOTSCO" "AVRAAUT"
# there are 26 variables which are correlated with each other, before dropping them we need to see how they are correlated with response variable.
length(high)
## [1] 26
target_cor_df = data.frame(CARAVAN = cor(df_train[,sort(high)], df_train[, "CARAVAN"]))
cor_df = target_cor_df[order(target_cor_df$CARAVAN,decreasing = T),,drop=F]
excludedVariables = row.names(cor_df[cor_df$CARAVAN < 0.1, ,drop=F])
excludedVariables
## [1] "PWAPART" "PBRAND" "MSKA" "PBYSTAND" "MZPART" "PGEZONG"
## [7] "PFIETS" "AWAOREG" "PLEVEN" "PZEILPL" "AAANHANG" "PMOTSCO"
## [13] "PINBOED" "AWABEDR" "PBESAUT" "AVRAAUT" "PPERSONG" "PTRACTOR"
## [19] "AWERKT" "PWALAND" "MGODGE" "PBROM" "MRELOV" "MHHUUR"
paste0("excluding total variables from main data set ", length(excludedVariables))
## [1] "excluding total variables from main data set 24"
# There are 24 variables which are less correlated with response variable and having correlation coefficient less than 0.1 so we will exclude them.
train_2 = data.frame(train_2[, !colnames(train_2) %in% excludedVariables])
names(train_2)
## [1] "MOSTYPE" "MGODRK" "MGODPR" "MGODOV" "MRELGE" "MRELSA"
## [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1" "MSKB2" "MSKC" "MSKD" "MHKOOP" "MAUT1"
## [25] "MAUT2" "MAUT0" "MZFONDS" "MINKM30" "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR" "PPERSAUT" "PVRAAUT"
## [37] "PAANHANG" "PWERKT" "PWAOREG" "PPLEZIER" "AWAPART" "AWALAND"
## [43] "APERSAUT" "ABESAUT" "AMOTSCO" "ATRACTOR" "ABROM" "ALEVEN"
## [49] "APERSONG" "AGEZONG" "ABRAND" "AZEILPL" "APLEZIER" "AFIETS"
## [55] "AINBOED" "ABYSTAND" "CARAVAN"
length(train_2)
## [1] 57
# corrplot(cor(train_3), method = "number")
# 24 + 5 (predictors in model) = 29
# around 29 variables have been excluded from set so far next step would be to find good predictors which are not highly correlated each other and are significant.
# we have reduce dimension from 86 to 57
# corelation between no of car policy and carvan
cor(df_train$APERSAUT, df_train$CARAVAN)
## [1] 0.1442105
# # corelation between contribution car policies and carvan
cor(df_train$PPERSAUT, df_train$CARAVAN)
## [1] 0.1509097
# # corelation between Purchasing power class and carvan
cor(df_train$MKOOPKLA, df_train$CARAVAN)
## [1] 0.09593826
# Correlate every remaining predictor with the response (CARAVAN) and
# report estimates with their p-values, sorted strongest-positive first.
new_train_2 = train_2
# Factors must become numeric codes before cor.test().
for(i in seq_len(ncol(new_train_2))){
new_train_2[,i] <- as.numeric(new_train_2[,i])
}
cor_response = data.frame("ind_var" = colnames(new_train_2), "dep_var" = "CARAVAN", "cor_coeff" = 0, "p_values" = 0)
for (i in colnames(new_train_2)){
cor_test <- cor.test(new_train_2[,i], new_train_2[,"CARAVAN"])
# BUG FIX: the estimates were originally written into a NEW
# "correlation_coefficient" column while the ordering below sorted on the
# all-zero "cor_coeff" placeholder, so the "sorted" table was not sorted
# by correlation at all. Fill the placeholder column instead.
cor_response[cor_response$ind_var == i, "cor_coeff"] = cor_test$estimate
cor_response[cor_response$ind_var == i, "p_values"] = cor_test$p.value
}
cor_response[order(cor_response$cor_coeff, decreasing = TRUE),]
## ind_var dep_var cor_coeff p_values correlation_coefficient
## 1 MOSTYPE CARAVAN 0 4.415694e-46 -0.135586889
## 2 MGODRK CARAVAN 0 2.583334e-01 0.010803843
## 3 MGODPR CARAVAN 0 3.231555e-12 0.066521701
## 4 MGODOV CARAVAN 0 3.624139e-01 0.008705328
## 5 MRELGE CARAVAN 0 2.020100e-65 0.162193454
## 6 MRELSA CARAVAN 0 2.489261e-14 -0.072761240
## 7 MFALLEEN CARAVAN 0 4.095222e-41 -0.127830892
## 8 MFGEKIND CARAVAN 0 2.308525e-01 0.011452092
## 9 MFWEKIND CARAVAN 0 1.871244e-16 0.078544087
## 10 MOPLHOOG CARAVAN 0 2.694548e-65 0.162035905
## 11 MOPLMIDD CARAVAN 0 2.549701e-22 0.092679218
## 12 MOPLLAAG CARAVAN 0 5.086443e-87 -0.187321638
## 13 MBERHOOG CARAVAN 0 4.801007e-40 0.126097758
## 14 MBERZELF CARAVAN 0 3.436983e-06 0.044355711
## 15 MBERBOER CARAVAN 0 2.812603e-39 -0.124837889
## 16 MBERMIDD CARAVAN 0 6.530849e-25 0.098282054
## 17 MBERARBG CARAVAN 0 1.966259e-20 -0.088376598
## 18 MBERARBO CARAVAN 0 1.086222e-37 -0.122192046
## 19 MSKB1 CARAVAN 0 8.813456e-12 0.065163033
## 20 MSKB2 CARAVAN 0 4.431230e-01 0.007330476
## 21 MSKC CARAVAN 0 6.375227e-19 -0.084775686
## 22 MSKD CARAVAN 0 1.023999e-49 -0.140985103
## 23 MHKOOP CARAVAN 0 8.026033e-79 0.178225584
## 24 MAUT1 CARAVAN 0 6.500578e-54 0.146964982
## 25 MAUT2 CARAVAN 0 3.165699e-02 0.020535970
## 26 MAUT0 CARAVAN 0 2.805594e-69 -0.166974459
## 27 MZFONDS CARAVAN 0 1.652767e-35 -0.118455005
## 28 MINKM30 CARAVAN 0 6.011566e-78 -0.177226008
## 29 MINK3045 CARAVAN 0 6.399294e-01 -0.004471342
## 30 MINK4575 CARAVAN 0 1.570240e-35 0.118493703
## 31 MINK7512 CARAVAN 0 3.180369e-38 0.123087973
## 32 MINK123M CARAVAN 0 5.496601e-01 -0.005718413
## 33 MKOOPKLA CARAVAN 0 9.928322e-101 0.201559037
## 34 PWABEDR CARAVAN 0 8.057106e-01 0.002350997
## 35 PPERSAUT CARAVAN 0 7.707523e-290 0.337512725
## 36 PVRAAUT CARAVAN 0 3.485079e-03 -0.027917884
## 37 PAANHANG CARAVAN 0 1.397315e-02 0.023490693
## 38 PWERKT CARAVAN 0 1.875280e-05 -0.040885355
## 39 PWAOREG CARAVAN 0 1.281517e-08 0.054332527
## 40 PPLEZIER CARAVAN 0 7.019402e-28 0.104330662
## 41 AWAPART CARAVAN 0 1.924030e-92 0.193087412
## 42 AWALAND CARAVAN 0 1.186585e-06 -0.046408075
## 43 APERSAUT CARAVAN 0 4.738411e-225 0.299082035
## 44 ABESAUT CARAVAN 0 1.034142e-01 -0.015565015
## 45 AMOTSCO CARAVAN 0 7.869497e-02 0.016805316
## 46 ATRACTOR CARAVAN 0 1.650151e-05 -0.041156296
## 47 ABROM CARAVAN 0 7.781814e-37 -0.120741653
## 48 ALEVEN CARAVAN 0 2.431827e-12 0.066901849
## 49 APERSONG CARAVAN 0 1.563785e-01 -0.013546952
## 50 AGEZONG CARAVAN 0 5.895091e-08 0.051790072
## 51 ABRAND CARAVAN 0 7.039583e-60 0.155053827
## 52 AZEILPL CARAVAN 0 2.039983e-04 0.035488595
## 53 APLEZIER CARAVAN 0 3.017601e-35 0.117999215
## 54 AFIETS CARAVAN 0 3.534313e-11 0.063235684
## 55 AINBOED CARAVAN 0 2.257429e-04 0.035243207
## 56 ABYSTAND CARAVAN 0 5.291765e-23 0.094187928
## 57 CARAVAN CARAVAN 0 0.000000e+00 1.000000000
corrplot(cor(subset(new_train_2 , select = c(-CARAVAN))), method = "square", type = "upper")
# corelation between Contribution car policies and number of car policies
cor(df_train$PPERSAUT, df_train$APERSAUT)
## [1] 0.9161545
# There is a high correlation between car policies and number of car policies we will exclude a variable which has less correlation with response variable.
# it's not always necessary to see how much it relates which response variable but it's good as it's tells us how much response variable changes for given predictor.
cor(df_train[ , c("PPERSAUT", "APERSAUT")], df_train[ , "CARAVAN"])
## CARAVAN
## PPERSAUT 0.1509097
## APERSAUT 0.1442105
# Contribution car policies is more correlated with response variable so we exclude Number of car policies
train_2 = data.frame(train_2[ , !colnames(train_2) %in% c("APERSAUT")])
paste0("after removing APERSAUT dimension is", length(train_2))
## [1] "after removing APERSAUT dimension is56"
# final variable selected
names(train_2)
## [1] "MOSTYPE" "MGODRK" "MGODPR" "MGODOV" "MRELGE" "MRELSA"
## [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1" "MSKB2" "MSKC" "MSKD" "MHKOOP" "MAUT1"
## [25] "MAUT2" "MAUT0" "MZFONDS" "MINKM30" "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR" "PPERSAUT" "PVRAAUT"
## [37] "PAANHANG" "PWERKT" "PWAOREG" "PPLEZIER" "AWAPART" "AWALAND"
## [43] "ABESAUT" "AMOTSCO" "ATRACTOR" "ABROM" "ALEVEN" "APERSONG"
## [49] "AGEZONG" "ABRAND" "AZEILPL" "APLEZIER" "AFIETS" "AINBOED"
## [55] "ABYSTAND" "CARAVAN"
length(train_2)
## [1] 56
step.wise1 = glm(CARAVAN ~ ., data = train_2, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(step.wise1)
# difference between null deviance. = 15177 - 12386 = 2791
# step(step.wise1, direction = "backwards")
# after running backward dimension has reduce to from 56 to 41 variables
# null deviance = 15177 - 12394 = 2783
# Convert every column of train_2 to a factor so glm() dummy-codes each
# level instead of treating the codes as numeric scores.
# seq_len() is the safe index sequence: 1:ncol() would iterate c(1, 0)
# if the frame ever had zero columns.
for (i in seq_len(ncol(train_2))) {
  train_2[, i] <- as.factor(train_2[, i])
}
# model.2 = glm(formula = CARAVAN ~ MAANTHUI + MGEMOMV + MGEMLEEF + MOSHOOFD +
# MGODRK + MGODPR + MGODOV + MGODGE + MRELGE + MRELSA + MFWEKIND + MOPLMIDD, family = binomial(link = "logit"), data = train_2)
# Logistic regression on the variable set retained by backward stepwise
# selection, refit on the oversampled training data.
# NOTE(review): the factor conversion above was applied to train_2, but
# this model is fit on over_train — confirm over_train's columns are
# already factors, otherwise the dummy coding shown in the summary below
# would not arise.
model.2 = glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV +
MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF +
MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD +
MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M +
MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT +
PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN +
APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND,
family = binomial(link = "logit"), data = over_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# over_test contains factor levels that never occur in over_train; merge
# them into the fitted model's xlevels so predict() below does not stop
# with "factor ... has new levels". Looping over the affected variables
# replaces ten copy-pasted union() lines (over_test[[v]] is equivalent
# to over_test$<name> for a literal column name).
for (v in c("MSKD", "MAUT2", "MINK123M", "PPERSAUT", "PVRAAUT",
            "PWERKT", "ABROM", "ALEVEN", "ABRAND", "AFIETS")) {
  model.2$xlevels[[v]] <- union(model.2$xlevels[[v]], levels(over_test[[v]]))
}
drewSummary(model.2)
##
## Call:
## glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV +
## MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF +
## MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD +
## MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M +
## MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT +
## PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN +
## APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND,
## family = binomial(link = "logit"), data = over_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3431 -0.6506 0.0000 0.7786 2.1267
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.41685 4579.63857 -0.002 0.998708
## MOSTYPE2 -0.10324 0.71288 -0.145 0.884854
## MOSTYPE3 -0.16883 0.66344 -0.254 0.799126
## MOSTYPE4 -1.08821 0.71197 -1.528 0.126402
## MOSTYPE5 2.72226 1.60590 1.695 0.090046 .
## MOSTYPE6 -0.47535 0.26644 -1.784 0.074417 .
## MOSTYPE7 -1.14561 0.69688 -1.644 0.100193
## MOSTYPE8 0.55079 0.47531 1.159 0.246531
## MOSTYPE9 2.21767 1.54445 1.436 0.151033
## MOSTYPE10 -0.28980 0.26709 -1.085 0.277918
## MOSTYPE11 -0.26363 0.65487 -0.403 0.687265
## MOSTYPE12 1.93786 0.48671 3.982 6.85e-05 ***
## MOSTYPE13 -1.21544 0.66186 -1.836 0.066299 .
## MOSTYPE15 -27.10438 3095.65811 -0.009 0.993014
## MOSTYPE16 -28.27421 1749.33018 -0.016 0.987104
## MOSTYPE17 -17.98471 1879.42406 -0.010 0.992365
## MOSTYPE18 -30.12218 1771.39099 -0.017 0.986433
## MOSTYPE19 -16.14076 3176.46592 -0.005 0.995946
## MOSTYPE20 -13.14323 1366.24381 -0.010 0.992324
## MOSTYPE21 -28.20314 1992.20104 -0.014 0.988705
## MOSTYPE22 -13.46425 1366.24373 -0.010 0.992137
## MOSTYPE23 2.14768 1.59505 1.346 0.178153
## MOSTYPE24 -13.31532 1366.24376 -0.010 0.992224
## MOSTYPE25 -13.48114 1366.24395 -0.010 0.992127
## MOSTYPE26 -11.79190 1366.24401 -0.009 0.993114
## MOSTYPE27 -14.27165 1366.24401 -0.010 0.991666
## MOSTYPE28 -30.71025 1698.47508 -0.018 0.985574
## MOSTYPE29 2.57630 1.61237 1.598 0.110079
## MOSTYPE30 -12.50424 1366.24377 -0.009 0.992698
## MOSTYPE31 -12.62243 1366.24395 -0.009 0.992629
## MOSTYPE32 -12.14105 1366.24395 -0.009 0.992910
## MOSTYPE33 3.71042 1.58307 2.344 0.019087 *
## MOSTYPE34 -0.41054 0.67496 -0.608 0.543026
## MOSTYPE35 1.64611 1.51092 1.089 0.275945
## MOSTYPE36 4.32379 1.58836 2.722 0.006485 **
## MOSTYPE37 2.98559 1.54804 1.929 0.053777 .
## MOSTYPE38 3.10214 1.54153 2.012 0.044180 *
## MOSTYPE39 2.29400 1.50275 1.527 0.126878
## MOSTYPE40 -16.24945 638.31965 -0.025 0.979691
## MOSTYPE41 2.06908 1.55430 1.331 0.183124
## MGODRK1 0.40285 0.09280 4.341 1.42e-05 ***
## MGODRK2 0.33758 0.10358 3.259 0.001118 **
## MGODRK3 -0.76994 0.23278 -3.308 0.000941 ***
## MGODRK4 -1.16676 0.47268 -2.468 0.013572 *
## MGODRK5 1.28077 0.52556 2.437 0.014812 *
## MGODRK6 0.40570 0.67000 0.606 0.544830
## MGODRK7 -15.75900 2433.76873 -0.006 0.994834
## MGODRK8 -15.21832 3403.75972 -0.004 0.996433
## MGODRK9 -14.94526 2408.35442 -0.006 0.995049
## MGODPR1 0.78586 0.55549 1.415 0.157156
## MGODPR2 2.55851 0.50764 5.040 4.65e-07 ***
## MGODPR3 2.45086 0.50261 4.876 1.08e-06 ***
## MGODPR4 2.67961 0.50108 5.348 8.91e-08 ***
## MGODPR5 2.51578 0.49812 5.051 4.40e-07 ***
## MGODPR6 2.33213 0.49841 4.679 2.88e-06 ***
## MGODPR7 3.22119 0.50732 6.349 2.16e-10 ***
## MGODPR8 2.81991 0.58187 4.846 1.26e-06 ***
## MGODPR9 2.53108 0.53082 4.768 1.86e-06 ***
## MGODOV1 -0.52840 0.09715 -5.439 5.35e-08 ***
## MGODOV2 -0.04173 0.09437 -0.442 0.658345
## MGODOV3 0.46006 0.16416 2.802 0.005072 **
## MGODOV4 -0.89691 0.25812 -3.475 0.000511 ***
## MGODOV5 1.59640 0.51383 3.107 0.001891 **
## MRELGE1 -18.17325 416.44469 -0.044 0.965192
## MRELGE2 -2.18561 0.58357 -3.745 0.000180 ***
## MRELGE3 -0.68288 0.52201 -1.308 0.190813
## MRELGE4 -1.14784 0.50224 -2.285 0.022288 *
## MRELGE5 -0.77731 0.48886 -1.590 0.111824
## MRELGE6 -1.01538 0.49558 -2.049 0.040473 *
## MRELGE7 -0.41238 0.49275 -0.837 0.402650
## MRELGE8 -0.34251 0.51075 -0.671 0.502468
## MRELGE9 -0.38500 0.49606 -0.776 0.437683
## MRELSA1 0.02736 0.09511 0.288 0.773600
## MRELSA2 0.04240 0.11325 0.374 0.708126
## MRELSA3 1.20022 0.23333 5.144 2.69e-07 ***
## MRELSA4 -1.82819 0.51742 -3.533 0.000410 ***
## MRELSA5 -17.11939 1300.75148 -0.013 0.989499
## MRELSA6 -18.31177 1560.94502 -0.012 0.990640
## MRELSA7 -0.79912 6535.91926 0.000 0.999902
## MOPLMIDD1 -0.32384 0.31794 -1.019 0.308403
## MOPLMIDD2 -0.82103 0.27005 -3.040 0.002363 **
## MOPLMIDD3 -1.20389 0.26250 -4.586 4.51e-06 ***
## MOPLMIDD4 -1.61346 0.26607 -6.064 1.33e-09 ***
## MOPLMIDD5 -1.62228 0.27624 -5.873 4.29e-09 ***
## MOPLMIDD6 -1.67106 0.30278 -5.519 3.41e-08 ***
## MOPLMIDD7 -1.68838 0.34193 -4.938 7.90e-07 ***
## MOPLMIDD8 -2.67864 0.49748 -5.384 7.27e-08 ***
## MOPLMIDD9 -2.87221 0.44908 -6.396 1.60e-10 ***
## MOPLLAAG1 -0.58933 0.28750 -2.050 0.040379 *
## MOPLLAAG2 -0.50970 0.25183 -2.024 0.042972 *
## MOPLLAAG3 -0.83420 0.25112 -3.322 0.000894 ***
## MOPLLAAG4 -1.50497 0.25958 -5.798 6.72e-09 ***
## MOPLLAAG5 -1.86201 0.27526 -6.764 1.34e-11 ***
## MOPLLAAG6 -2.13325 0.29978 -7.116 1.11e-12 ***
## MOPLLAAG7 -2.37819 0.33730 -7.051 1.78e-12 ***
## MOPLLAAG8 -2.61895 0.41504 -6.310 2.79e-10 ***
## MOPLLAAG9 -4.63719 0.45506 -10.190 < 2e-16 ***
## MBERHOOG1 -0.21111 0.11691 -1.806 0.070965 .
## MBERHOOG2 0.42619 0.13162 3.238 0.001204 **
## MBERHOOG3 0.06442 0.18401 0.350 0.726263
## MBERHOOG4 0.77379 0.23736 3.260 0.001114 **
## MBERHOOG5 0.25770 0.31671 0.814 0.415830
## MBERHOOG6 1.82551 0.38131 4.787 1.69e-06 ***
## MBERHOOG7 2.48575 0.47462 5.237 1.63e-07 ***
## MBERHOOG8 4.38061 0.69155 6.334 2.38e-10 ***
## MBERHOOG9 1.04553 0.66383 1.575 0.115257
## MBERZELF1 0.24292 0.10194 2.383 0.017175 *
## MBERZELF2 0.43503 0.15061 2.889 0.003870 **
## MBERZELF3 0.61528 0.43048 1.429 0.152917
## MBERZELF4 -0.04689 0.69496 -0.067 0.946209
## MBERZELF5 -0.27715 0.50437 -0.549 0.582670
## MBERBOER1 0.22806 0.10311 2.212 0.026989 *
## MBERBOER2 0.26898 0.16473 1.633 0.102513
## MBERBOER3 1.06456 0.29376 3.624 0.000290 ***
## MBERBOER4 -0.34118 0.45635 -0.748 0.454681
## MBERBOER5 3.45087 0.57858 5.964 2.46e-09 ***
## MBERBOER6 -12.21960 1211.19733 -0.010 0.991950
## MBERBOER7 4.28602 3679.77206 0.001 0.999071
## MBERBOER8 9.57676 2624.35449 0.004 0.997088
## MBERBOER9 6.53143 3285.83279 0.002 0.998414
## MBERMIDD1 -0.72007 0.20360 -3.537 0.000405 ***
## MBERMIDD2 0.07983 0.17384 0.459 0.646065
## MBERMIDD3 0.06615 0.20720 0.319 0.749511
## MBERMIDD4 0.87849 0.25001 3.514 0.000442 ***
## MBERMIDD5 0.82237 0.31053 2.648 0.008091 **
## MBERMIDD6 1.53755 0.36916 4.165 3.11e-05 ***
## MBERMIDD7 1.55577 0.45750 3.401 0.000672 ***
## MBERMIDD8 -19.80784 1294.50449 -0.015 0.987792
## MBERMIDD9 2.67522 0.55795 4.795 1.63e-06 ***
## MBERARBG1 1.13599 0.13617 8.342 < 2e-16 ***
## MBERARBG2 0.98691 0.15747 6.267 3.68e-10 ***
## MBERARBG3 0.66523 0.20241 3.286 0.001014 **
## MBERARBG4 0.84202 0.25466 3.306 0.000945 ***
## MBERARBG5 1.78966 0.32328 5.536 3.09e-08 ***
## MBERARBG6 0.03182 0.42382 0.075 0.940145
## MBERARBG7 3.51117 0.56369 6.229 4.70e-10 ***
## MBERARBG8 2.91265 0.62167 4.685 2.80e-06 ***
## MBERARBG9 4.47032 0.82846 5.396 6.82e-08 ***
## MBERARBO1 -0.14848 0.13040 -1.139 0.254845
## MBERARBO2 -0.05052 0.14875 -0.340 0.734118
## MBERARBO3 0.25410 0.18450 1.377 0.168451
## MBERARBO4 0.89950 0.23547 3.820 0.000133 ***
## MBERARBO5 1.32839 0.31491 4.218 2.46e-05 ***
## MBERARBO6 0.61644 0.41215 1.496 0.134736
## MBERARBO7 1.56012 0.61400 2.541 0.011056 *
## MBERARBO8 4.34834 0.87024 4.997 5.83e-07 ***
## MBERARBO9 -16.14269 948.86166 -0.017 0.986426
## MSKC1 0.62343 0.26970 2.312 0.020804 *
## MSKC2 0.58111 0.23777 2.444 0.014524 *
## MSKC3 0.59414 0.23462 2.532 0.011331 *
## MSKC4 0.84329 0.24768 3.405 0.000662 ***
## MSKC5 0.91687 0.25345 3.618 0.000297 ***
## MSKC6 1.46236 0.28491 5.133 2.86e-07 ***
## MSKC7 1.40819 0.31689 4.444 8.84e-06 ***
## MSKC8 2.59305 0.40569 6.392 1.64e-10 ***
## MSKC9 0.18280 0.44003 0.415 0.677830
## MSKD1 0.11917 0.09555 1.247 0.212333
## MSKD2 -0.03512 0.11397 -0.308 0.757974
## MSKD3 -0.41328 0.16551 -2.497 0.012521 *
## MSKD4 -0.44892 0.24994 -1.796 0.072474 .
## MSKD5 -1.70222 0.60611 -2.808 0.004978 **
## MSKD6 -16.10247 520.21397 -0.031 0.975307
## MSKD7 2.98897 1.17222 2.550 0.010778 *
## MSKD9 -14.59765 6522.63864 -0.002 0.998214
## MHKOOP1 -0.55352 0.18145 -3.051 0.002285 **
## MHKOOP2 -0.86983 0.16796 -5.179 2.23e-07 ***
## MHKOOP3 -0.34876 0.16428 -2.123 0.033758 *
## MHKOOP4 -0.42117 0.16418 -2.565 0.010309 *
## MHKOOP5 -0.42347 0.15858 -2.670 0.007577 **
## MHKOOP6 -0.17152 0.15691 -1.093 0.274343
## MHKOOP7 -0.64088 0.15196 -4.217 2.47e-05 ***
## MHKOOP8 -0.26383 0.16537 -1.595 0.110621
## MHKOOP9 -0.19365 0.15298 -1.266 0.205552
## MAUT11 9.66462 7969.81180 0.001 0.999032
## MAUT12 12.79757 4371.09442 0.003 0.997664
## MAUT13 12.43480 4371.09436 0.003 0.997730
## MAUT14 15.52138 4371.09434 0.004 0.997167
## MAUT15 16.09448 4371.09435 0.004 0.997062
## MAUT16 16.14883 4371.09436 0.004 0.997052
## MAUT17 16.45876 4371.09436 0.004 0.996996
## MAUT18 16.57530 4371.09438 0.004 0.996974
## MAUT19 15.84595 4371.09439 0.004 0.997108
## MAUT21 0.21571 0.11864 1.818 0.069037 .
## MAUT22 -0.13414 0.16256 -0.825 0.409269
## MAUT23 -0.25076 0.25931 -0.967 0.333518
## MAUT24 0.33153 0.34615 0.958 0.338187
## MAUT25 0.07256 0.58927 0.123 0.901996
## MAUT26 4.40776 1.15852 3.805 0.000142 ***
## MAUT27 2.88501 6619.67724 0.000 0.999652
## MAUT01 -0.11959 0.14176 -0.844 0.398896
## MAUT02 0.34426 0.16545 2.081 0.037454 *
## MAUT03 0.76243 0.25202 3.025 0.002484 **
## MAUT04 -0.96359 0.35664 -2.702 0.006896 **
## MAUT05 0.87832 0.50280 1.747 0.080664 .
## MAUT06 4.07294 0.89493 4.551 5.34e-06 ***
## MAUT07 -57.38433 1092.31229 -0.053 0.958103
## MAUT08 -6.10220 6772.90905 -0.001 0.999281
## MAUT09 -3.16188 4526.53656 -0.001 0.999443
## MINK30451 0.11489 0.20108 0.571 0.567759
## MINK30452 0.40932 0.15932 2.569 0.010193 *
## MINK30453 0.22125 0.15447 1.432 0.152071
## MINK30454 0.05235 0.15478 0.338 0.735204
## MINK30455 -0.10527 0.16433 -0.641 0.521802
## MINK30456 -0.06767 0.18046 -0.375 0.707673
## MINK30457 0.17984 0.22223 0.809 0.418375
## MINK30458 0.09146 0.40537 0.226 0.821496
## MINK30459 1.49863 0.29286 5.117 3.10e-07 ***
## MINK75121 0.55873 0.09104 6.137 8.40e-10 ***
## MINK75122 0.20065 0.10795 1.859 0.063078 .
## MINK75123 0.18764 0.15337 1.223 0.221142
## MINK75124 0.66149 0.19793 3.342 0.000831 ***
## MINK75125 -0.63363 0.29297 -2.163 0.030557 *
## MINK75126 -18.04942 1826.99560 -0.010 0.992118
## MINK75127 -21.48448 6522.63864 -0.003 0.997372
## MINK75128 -20.17604 2453.54957 -0.008 0.993439
## MINK75129 3.81007 0.90977 4.188 2.81e-05 ***
## MINK123M1 -0.23754 0.10816 -2.196 0.028086 *
## MINK123M2 -0.05399 0.24291 -0.222 0.824110
## MINK123M3 -1.14967 0.40195 -2.860 0.004233 **
## MINK123M4 -18.27140 1129.29670 -0.016 0.987091
## MINK123M5 -19.42596 6522.63862 -0.003 0.997624
## MINK123M7 -23.06411 6522.63865 -0.004 0.997179
## MINK123M9 -17.33042 6522.63868 -0.003 0.997880
## MKOOPKLA2 0.42918 0.70559 0.608 0.543019
## MKOOPKLA3 -15.74138 1366.24300 -0.012 0.990807
## MKOOPKLA4 -14.76526 1366.24305 -0.011 0.991377
## MKOOPKLA5 -14.14797 1366.24311 -0.010 0.991738
## MKOOPKLA6 -11.85123 1366.24377 -0.009 0.993079
## MKOOPKLA7 -12.64922 1366.24385 -0.009 0.992613
## MKOOPKLA8 -12.72138 1366.24391 -0.009 0.992571
## PPERSAUT4 -17.74738 6522.63861 -0.003 0.997829
## PPERSAUT5 -0.23505 0.11815 -1.989 0.046648 *
## PPERSAUT6 1.79486 0.06328 28.364 < 2e-16 ***
## PPERSAUT7 -17.69278 815.91470 -0.022 0.982700
## PPERSAUT8 -16.21935 4248.45184 -0.004 0.996954
## PMOTSCO3 3.60728 1.56510 2.305 0.021176 *
## PMOTSCO4 -0.19549 0.17860 -1.095 0.273729
## PMOTSCO5 0.12055 0.30648 0.393 0.694079
## PMOTSCO6 -2.73265 0.53918 -5.068 4.02e-07 ***
## PMOTSCO7 -18.81346 4310.32002 -0.004 0.996517
## PVRAAUT4 -19.00878 6522.63861 -0.003 0.997675
## PVRAAUT6 -16.93379 2122.80799 -0.008 0.993635
## PVRAAUT9 0.30845 6573.47178 0.000 0.999963
## PAANHANG1 1.49536 0.45640 3.276 0.001051 **
## PAANHANG2 0.93465 0.37819 2.471 0.013461 *
## PAANHANG3 -16.41691 2219.91357 -0.007 0.994099
## PAANHANG4 -19.03218 6522.63862 -0.003 0.997672
## PAANHANG5 -2.59521 6859.38251 0.000 0.999698
## PWERKT2 -16.15994 2250.50380 -0.007 0.994271
## PWERKT3 -16.41642 2024.09365 -0.008 0.993529
## PWERKT4 -15.10514 1862.13909 -0.008 0.993528
## PWERKT6 -13.59613 2980.08389 -0.005 0.996360
## PWAOREG4 -19.01229 6522.63861 -0.003 0.997674
## PWAOREG5 -15.88444 6522.63862 -0.002 0.998057
## PWAOREG6 3.03697 0.41525 7.314 2.60e-13 ***
## PWAOREG7 -18.78709 3215.37010 -0.006 0.995338
## PPLEZIER1 5.28855 1.58056 3.346 0.000820 ***
## PPLEZIER2 0.28372 1.46690 0.193 0.846636
## PPLEZIER3 2.87275 1.45969 1.968 0.049062 *
## PPLEZIER4 0.90003 1.21630 0.740 0.459318
## PPLEZIER5 -16.16662 3841.44362 -0.004 0.996642
## PPLEZIER6 50.42224 854.52137 0.059 0.952947
## AWAPART1 0.45023 0.07350 6.126 9.04e-10 ***
## AWAPART2 -17.91175 2140.36143 -0.008 0.993323
## AWALAND1 -1.62608 0.27649 -5.881 4.07e-09 ***
## ABROM1 -0.81126 0.15225 -5.328 9.90e-08 ***
## ABROM2 -17.50707 1537.26921 -0.011 0.990914
## ALEVEN1 -0.93333 0.17662 -5.284 1.26e-07 ***
## ALEVEN2 0.06703 0.18209 0.368 0.712793
## ALEVEN3 0.35697 0.45900 0.778 0.436738
## ALEVEN4 2.11320 0.65493 3.227 0.001253 **
## ALEVEN8 -0.83679 7784.22492 0.000 0.999914
## APERSONG1 -3.87707 0.91367 -4.243 2.20e-05 ***
## AGEZONG1 0.25353 0.30356 0.835 0.403601
## ABRAND1 0.38349 0.07494 5.117 3.10e-07 ***
## ABRAND2 0.16098 0.19968 0.806 0.420133
## ABRAND3 -19.01316 1746.28080 -0.011 0.991313
## ABRAND4 -16.83111 2931.62747 -0.006 0.995419
## ABRAND5 -12.93114 3884.72146 -0.003 0.997344
## ABRAND7 -16.29095 6522.63861 -0.002 0.998007
## APLEZIER1 0.99051 1.31145 0.755 0.450082
## APLEZIER2 NA NA NA NA
## AFIETS1 1.00642 0.19752 5.095 3.48e-07 ***
## AFIETS2 0.74848 0.29038 2.578 0.009948 **
## AFIETS3 2.79384 1.17912 2.369 0.017816 *
## ABYSTAND1 0.78205 0.19977 3.915 9.05e-05 ***
## ABYSTAND2 -18.64288 6522.63861 -0.003 0.997720
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177.2 on 10947 degrees of freedom
## Residual deviance: 9775.8 on 10662 degrees of freedom
## AIC: 10348
##
## Number of Fisher Scoring iterations: 17
drewMatrix(model.2, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2001 110
## 1 1761 128
##
## Accuracy : 0.5322
## 95% CI : (0.5166, 0.5478)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0164
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.53782
## Specificity : 0.53190
## Pos Pred Value : 0.06776
## Neg Pred Value : 0.94789
## Prevalence : 0.05950
## Detection Rate : 0.03200
## Detection Prevalence : 0.47225
## Balanced Accuracy : 0.53486
##
## 'Positive' Class : 1
##
# difference in deviance = Null deviance (15177.2) - 9775.8 = 5401.4
# Sensitivity : 54%
# Accuracy : 53%
corrplot(cor(subset(df_train , select = c("PBRAND", "MOSTYPE", "PPERSAUT", "MKOOPKLA", "MHKOOP", "CARAVAN"))), method = "number", type = "upper")
# Work on a factor-converted copy of the oversampled training data so the
# reduced model below also treats each code as a categorical level.
# seq_len() is the safe replacement for 1:ncol().
train_3 <- over_train
for (i in seq_len(ncol(train_3))) {
  train_3[, i] <- as.factor(train_3[, i])
}
# Reduced logistic model: five predictors picked from the correlation
# plot above, fit on the factor-converted oversampled data.
model.3 = glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA+MHKOOP, family = binomial(link = "logit"),
data = train_3)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Merge the test set's unseen PPERSAUT levels into xlevels so predict()
# on over_test does not error with "factor has new levels".
model.3$xlevels[["PPERSAUT"]] <- union(model.3$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
predicted_3 = predict(model.3, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Hard 0.5 cutoff on the predicted probabilities.
predictedClass_3 = ifelse(predicted_3>=0.5, 1, 0)
drewSummary(model.3)
##
## Call:
## glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA +
## MHKOOP, family = binomial(link = "logit"), data = train_3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2188 -0.8846 0.2112 0.8877 2.0655
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.56737 542.41896 0.025 0.980045
## PBRAND1 -0.44457 0.18805 -2.364 0.018071 *
## PBRAND2 -1.09720 0.13286 -8.258 < 2e-16 ***
## PBRAND3 0.70914 0.06417 11.051 < 2e-16 ***
## PBRAND4 0.86543 0.05584 15.498 < 2e-16 ***
## PBRAND5 0.01437 0.14206 0.101 0.919446
## PBRAND6 -0.71539 0.19039 -3.758 0.000172 ***
## PBRAND7 -16.09720 662.23080 -0.024 0.980607
## PBRAND8 -16.67399 2399.54472 -0.007 0.994456
## MOSTYPE2 -1.55692 0.49312 -3.157 0.001592 **
## MOSTYPE3 -1.57461 0.47086 -3.344 0.000825 ***
## MOSTYPE4 -2.27590 0.52690 -4.319 1.56e-05 ***
## MOSTYPE5 0.21351 1.10997 0.192 0.847463
## MOSTYPE6 -0.19143 0.18977 -1.009 0.313093
## MOSTYPE7 -1.79296 0.50819 -3.528 0.000418 ***
## MOSTYPE8 -0.21303 0.35093 -0.607 0.543824
## MOSTYPE9 -0.91550 1.04692 -0.874 0.381859
## MOSTYPE10 -0.55308 0.18924 -2.923 0.003471 **
## MOSTYPE11 -2.10475 0.46146 -4.561 5.09e-06 ***
## MOSTYPE12 -0.12282 0.35074 -0.350 0.726202
## MOSTYPE13 -1.62817 0.46996 -3.464 0.000531 ***
## MOSTYPE15 -30.05373 1154.96824 -0.026 0.979240
## MOSTYPE16 -30.95223 748.96156 -0.041 0.967035
## MOSTYPE17 -17.31495 724.59387 -0.024 0.980936
## MOSTYPE18 -30.79590 728.77089 -0.042 0.966294
## MOSTYPE19 -16.48050 1235.29830 -0.013 0.989355
## MOSTYPE20 -14.89542 542.41874 -0.027 0.978092
## MOSTYPE21 -30.72381 784.93331 -0.039 0.968777
## MOSTYPE22 -15.62265 542.41870 -0.029 0.977023
## MOSTYPE23 -0.50915 1.08747 -0.468 0.639643
## MOSTYPE24 -15.73582 542.41873 -0.029 0.976856
## MOSTYPE25 -15.06992 542.41897 -0.028 0.977835
## MOSTYPE26 -15.51812 542.41905 -0.029 0.977176
## MOSTYPE27 -15.31559 542.41907 -0.028 0.977474
## MOSTYPE28 -30.61014 696.61845 -0.044 0.964951
## MOSTYPE29 -0.57512 1.10107 -0.522 0.601440
## MOSTYPE30 -15.85993 542.41873 -0.029 0.976674
## MOSTYPE31 -15.57445 542.41898 -0.029 0.977094
## MOSTYPE32 -14.40755 542.41897 -0.027 0.978809
## MOSTYPE33 0.28391 1.07957 0.263 0.792566
## MOSTYPE34 -2.19298 0.48035 -4.565 4.99e-06 ***
## MOSTYPE35 -1.83656 1.01195 -1.815 0.069541 .
## MOSTYPE36 0.85769 1.08415 0.791 0.428874
## MOSTYPE37 -0.55322 1.04391 -0.530 0.596148
## MOSTYPE38 -0.33805 1.04363 -0.324 0.745997
## MOSTYPE39 -1.30089 1.01238 -1.285 0.198796
## MOSTYPE40 -15.96420 253.43789 -0.063 0.949774
## MOSTYPE41 -1.28891 1.05010 -1.227 0.219667
## PPERSAUT4 -15.30126 2399.54472 -0.006 0.994912
## PPERSAUT5 -0.17935 0.09533 -1.881 0.059930 .
## PPERSAUT6 1.48920 0.04958 30.034 < 2e-16 ***
## PPERSAUT7 -15.65631 332.76882 -0.047 0.962474
## PPERSAUT8 -15.11916 1115.20566 -0.014 0.989183
## MKOOPKLA2 0.56639 0.52117 1.087 0.277144
## MKOOPKLA3 -15.33692 542.41790 -0.028 0.977443
## MKOOPKLA4 -14.09738 542.41796 -0.026 0.979265
## MKOOPKLA5 -13.53048 542.41802 -0.025 0.980099
## MKOOPKLA6 -12.98108 542.41876 -0.024 0.980907
## MKOOPKLA7 -13.90142 542.41884 -0.026 0.979554
## MKOOPKLA8 -14.67041 542.41893 -0.027 0.978423
## MHKOOP1 -0.23423 0.11634 -2.013 0.044079 *
## MHKOOP2 -0.17357 0.11482 -1.512 0.130606
## MHKOOP3 0.22847 0.11719 1.950 0.051232 .
## MHKOOP4 0.08435 0.10780 0.783 0.433912
## MHKOOP5 -0.12754 0.10708 -1.191 0.233640
## MHKOOP6 0.23910 0.10357 2.308 0.020972 *
## MHKOOP7 0.03356 0.10105 0.332 0.739770
## MHKOOP8 0.56480 0.11241 5.024 5.05e-07 ***
## MHKOOP9 0.34188 0.09857 3.468 0.000524 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177 on 10947 degrees of freedom
## Residual deviance: 11950 on 10879 degrees of freedom
## AIC: 12088
##
## Number of Fisher Scoring iterations: 15
drewMatrix(model.3, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2171 74
## 1 1591 164
##
## Accuracy : 0.5838
## 95% CI : (0.5683, 0.5991)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0668
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.68908
## Specificity : 0.57709
## Pos Pred Value : 0.09345
## Neg Pred Value : 0.96704
## Prevalence : 0.05950
## Detection Rate : 0.04100
## Detection Prevalence : 0.43875
## Balanced Accuracy : 0.63308
##
## 'Positive' Class : 1
##
getRMSE(predictedClass_3)
## [1] 0.8110179
# Area under curve
# ROC curve and AUC for the reduced model. Bug fix: the original chained
# `plot(perf) > auc(...)` — a console-prompt paste artifact. plot()
# returns NULL, so the comparison evaluated to logical(0) and the AUC
# value was discarded; the three steps must run as separate statements.
# NOTE(review): the curve is built from the hard 0/1 labels
# (predictedClass_3), not the probabilities (predicted_3); using the
# probabilities would give a proper, smoother ROC.
pr <- prediction(predictedClass_3, over_test$CARAVAN)
perf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(perf)
auc(over_test$CARAVAN, predictedClass_3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## logical(0)
auc_ROCR <- performance(pr, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
auc_ROCR
## [1] 0.6330811
pR2(model.3)['McFadden']
## fitting null model for pseudo-r2
## McFadden
## 0.2126512
# difference in deviance = Null deviance (15177) - 11950 = 3227
# Sensitivity : 69%
# Accuracy : 58%
# Explanation: ROC plot and AUC for the logistic regression; the AUC computed above is 0.6330811.
# predict() on a glm with no newdata and no type returns the linear
# predictor (log-odds) for the training observations, which is why the
# histogram ranges outside [0, 1].
pred_t <- predict(model.3, na.action=na.pass)
hist(pred_t)
boxplot(pred_t)
## Plotting residual histograms for the training data
resid.t<-residuals(model.3)
hist(resid.t)
drewROC(model.3)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Relevel so that class "1" (policy purchased) is the first level, i.e.
# the event of interest for caret::lift.
lift.example <- lift(relevel(as.factor(over_test$CARAVAN), ref="1") ~ predicted_3, data = over_test)
#xyplot(lift.example, plot = "gain")
library(gains)
# Decile-wise lift chart: mean observed response per score decile,
# divided by the overall response rate (bars > 1 mean that decile is
# enriched in buyers).
actual = as.numeric(over_test$CARAVAN)
predicted_3_num = as.numeric(predicted_3)
gain = gains(actual, predicted_3_num)
barplot(gain$mean.resp / mean(actual), names.arg = gain$depth, xlab = "Percentile", ylab = "Mean Response", main = "Decile-wise lift chart")
# Fresh factor-converted copy of the oversampled training data for the
# tree / compact-glm experiments below.
# seq_len() is the safe replacement for 1:ncol().
train_4 <- over_train
for (i in seq_len(ncol(train_4))) {
  train_4[, i] <- as.factor(train_4[, i])
}
# Classification tree on the oversampled training data.
# NOTE(review): fit on over_train, not the factor-converted train_4.
fit1 <- rpart(
  CARAVAN ~ .,
  data = over_train,
  method = "class",
  control = rpart.control(minsplit = 20, minbucket = 1, cp = 0.008)
)
fit1
## n= 10948
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 10948 5474 0 (0.50000000 0.50000000)
## 2) PPERSAUT=0,4,5,7,8 4772 1355 0 (0.71605197 0.28394803)
## 4) MOSTYPE=3,4,6,9,10,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,40,41 2532 376 0 (0.85150079 0.14849921) *
## 5) MOSTYPE=1,2,5,7,8,11,12,13,32,36,37,38,39 2240 979 0 (0.56294643 0.43705357)
## 10) PBRAND=1,2,5,6,7,8 214 11 0 (0.94859813 0.05140187) *
## 11) PBRAND=0,3,4 2026 968 0 (0.52221125 0.47778875)
## 22) MGODPR=0,1,4,8 515 140 0 (0.72815534 0.27184466) *
## 23) MGODPR=2,3,5,6,7,9 1511 683 1 (0.45201853 0.54798147)
## 46) PBRAND=0,3 1004 456 0 (0.54581673 0.45418327)
## 92) MBERARBG=0,3,4,6,8 387 91 0 (0.76485788 0.23514212) *
## 93) MBERARBG=1,2,5,7 617 252 1 (0.40842788 0.59157212)
## 186) MINKM30=3,4,7,9 56 0 0 (1.00000000 0.00000000) *
## 187) MINKM30=0,1,2,5,6 561 196 1 (0.34937611 0.65062389) *
## 47) PBRAND=4 507 135 1 (0.26627219 0.73372781) *
## 3) PPERSAUT=6 6176 2057 1 (0.33306347 0.66693653)
## 6) PBRAND=0,1,2,6,7 2287 1131 1 (0.49453432 0.50546568)
## 12) MOSTYPE=5,7,9,16,17,18,19,21,22,23,26,28,29,31,32,40,41 419 65 0 (0.84486874 0.15513126) *
## 13) MOSTYPE=1,2,3,4,6,8,10,11,12,13,20,24,25,27,30,33,34,35,36,37,38,39 1868 777 1 (0.41595289 0.58404711)
## 26) MSKA=0,2,8,9 783 345 0 (0.55938697 0.44061303)
## 52) MOSTYPE=2,4,6,13,20,25,27,30,33,34,35,39 361 89 0 (0.75346260 0.24653740) *
## 53) MOSTYPE=1,3,8,10,11,12,24,36,37,38 422 166 1 (0.39336493 0.60663507)
## 106) MGODGE=0,1,3,6,7 93 14 0 (0.84946237 0.15053763) *
## 107) MGODGE=2,4,5 329 87 1 (0.26443769 0.73556231) *
## 27) MSKA=1,3,4,5,6,7 1085 339 1 (0.31244240 0.68755760)
## 54) MBERARBO=2,5,6,7 242 96 0 (0.60330579 0.39669421)
## 108) MOSTYPE=1,2,3,4,6,8,10,11,13,20,24,27,30,33,36,38 130 15 0 (0.88461538 0.11538462) *
## 109) MOSTYPE=12,25,34,35,37,39 112 31 1 (0.27678571 0.72321429) *
## 55) MBERARBO=0,1,3,4,8 843 193 1 (0.22894425 0.77105575) *
## 7) PBRAND=3,4,5 3889 926 1 (0.23810748 0.76189252) *
printcp(fit1)
##
## Classification tree:
## rpart(formula = CARAVAN ~ ., data = over_train, method = "class",
## control = rpart.control(minsplit = 20, minbucket = 1, cp = 0.008))
##
## Variables actually used in tree construction:
## [1] MBERARBG MBERARBO MGODGE MGODPR MINKM30 MOSTYPE MSKA PBRAND
## [9] PPERSAUT
##
## Root node error: 5474/10948 = 0.5
##
## n= 10948
##
## CP nsplit rel error xerror xstd
## 1 0.3766898 0 1.00000 1.02320 0.0095547
## 2 0.0263975 1 0.62331 0.62349 0.0088540
## 3 0.0169894 3 0.57052 0.57526 0.0086523
## 4 0.0164414 4 0.55353 0.56284 0.0085956
## 5 0.0118743 5 0.53708 0.54786 0.0085246
## 6 0.0091341 6 0.52521 0.52576 0.0084141
## 7 0.0088296 8 0.50694 0.47497 0.0081340
## 8 0.0080000 14 0.43277 0.45177 0.0079930
fancyRpartPlot(fit1)
glm_6 = glm(formula = CARAVAN ~ MINKGEM+PBRAND+PPERSAUT, family = binomial(link = "logit"),data = train_4)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_6$xlevels[["PPERSAUT"]] <- union(glm_6$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
summary(glm_6)
##
## Call:
## glm(formula = CARAVAN ~ MINKGEM + PBRAND + PPERSAUT, family = binomial(link = "logit"),
## data = train_4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1010 -0.8672 0.2414 0.9333 2.1897
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.54726 264.87882 -0.062 0.9502
## MINKGEM1 14.09381 264.87905 0.053 0.9576
## MINKGEM2 14.88888 264.87883 0.056 0.9552
## MINKGEM3 14.92099 264.87882 0.056 0.9551
## MINKGEM4 15.71555 264.87882 0.059 0.9527
## MINKGEM5 15.76304 264.87883 0.060 0.9525
## MINKGEM6 15.33140 264.87883 0.058 0.9538
## MINKGEM7 16.27836 264.87885 0.061 0.9510
## MINKGEM8 16.04392 264.87888 0.061 0.9517
## MINKGEM9 0.09645 388.01649 0.000 0.9998
## PBRAND1 -0.47576 0.17996 -2.644 0.0082 **
## PBRAND2 -1.23404 0.12810 -9.633 < 2e-16 ***
## PBRAND3 0.64429 0.06098 10.566 < 2e-16 ***
## PBRAND4 0.94664 0.05322 17.788 < 2e-16 ***
## PBRAND5 -0.02308 0.13423 -0.172 0.8635
## PBRAND6 -1.07605 0.18146 -5.930 3.03e-09 ***
## PBRAND7 -14.87397 406.05762 -0.037 0.9708
## PBRAND8 -14.73435 1455.39753 -0.010 0.9919
## PPERSAUT4 -14.73435 1455.39753 -0.010 0.9919
## PPERSAUT5 -0.20002 0.09303 -2.150 0.0315 *
## PPERSAUT6 1.41281 0.04754 29.718 < 2e-16 ***
## PPERSAUT7 -14.57002 204.32470 -0.071 0.9432
## PPERSAUT8 -14.49846 672.22843 -0.022 0.9828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177 on 10947 degrees of freedom
## Residual deviance: 12260 on 10925 degrees of freedom
## AIC: 12306
##
## Number of Fisher Scoring iterations: 14
# Out-of-sample predicted probabilities for the three-variable model.
predicted_6 = predict(glm_6, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Classify with a 0.5 probability cutoff and evaluate against the test
# labels, treating "1" (policy bought) as the positive class.
predictedClass_6 = ifelse(predicted_6>=0.5, 1, 0)
confusionMatrix(as.factor(predictedClass_6), as.factor(over_test$CARAVAN), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2595 86
## 1 1167 152
##
## Accuracy : 0.6868
## 95% CI : (0.6721, 0.7011)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.105
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6387
## Specificity : 0.6898
## Pos Pred Value : 0.1152
## Neg Pred Value : 0.9679
## Prevalence : 0.0595
## Detection Rate : 0.0380
## Detection Prevalence : 0.3297
## Balanced Accuracy : 0.6642
##
## 'Positive' Class : 1
##
accuracy(predictedClass_6, as.numeric(over_test$CARAVAN))
## ME RMSE MAE MPE MAPE
## Test set 0.72975 0.879062 0.72975 68.925 68.925
pR2(glm_6)['McFadden']
## fitting null model for pseudo-r2
## McFadden
## 0.1922143
# difference in deviance = Null deviance (15177) - 12260 = 2917
# Accuracy 68.7%
# Sensitivity 63.9%
drewROC(glm_6)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
anova(logit.reg,model.2,test = 'Chisq')
## Analysis of Deviance Table
##
## Model 1: CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + MGEMLEEF_c
## Model 2: CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + MRELGE + MRELSA +
## MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + MBERBOER + MBERMIDD +
## MBERARBG + MBERARBO + MSKC + MSKD + MHKOOP + MAUT1 + MAUT2 +
## MAUT0 + MINK3045 + MINK7512 + MINK123M + MKOOPKLA + PPERSAUT +
## PMOTSCO + PVRAAUT + PAANHANG + PWERKT + PWAOREG + PPLEZIER +
## AWAPART + AWALAND + ABROM + ALEVEN + APERSONG + AGEZONG +
## ABRAND + APLEZIER + AFIETS + ABYSTAND
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10931 14375.5
## 2 10662 9775.8 269 4599.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model.3,glm_6,test = 'Chisq')
## Analysis of Deviance Table
##
## Model 1: CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + MHKOOP
## Model 2: CARAVAN ~ MINKGEM + PBRAND + PPERSAUT
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10879 11950
## 2 10925 12260 -46 -310.17 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1